{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Azure Translation API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install datasets\n",
    "# !pip install fsspec==2023.9.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests, uuid, json\n",
    "\n",
    "# Add your key and endpoint\n",
    "\n",
    "# location, also known as region.\n",
    "# required if you're using a multi-service or regional (not global) resource. It can be found in the Azure portal on the Keys and Endpoint page.\n",
    "\n",
    "def translate_text(text):\n",
    "    key = \"\"\n",
    "    endpoint = \"\"\n",
    "    location = \"\"\n",
    "    path = '/translate'\n",
    "    constructed_url = endpoint + path\n",
    "\n",
    "    params = {\n",
    "        'api-version': '3.0',\n",
    "        'from': 'en',\n",
    "        'to': 'kn'\n",
    "    }\n",
    "\n",
    "    headers = {\n",
    "        'Ocp-Apim-Subscription-Key': key,\n",
    "        'Ocp-Apim-Subscription-Region': location,\n",
    "        'Content-type': 'application/json',\n",
    "        'X-ClientTraceId': str(uuid.uuid4())\n",
    "    }\n",
    "    try:\n",
    "\n",
    "        body = [{\n",
    "            'text': text\n",
    "        }]\n",
    "\n",
    "        request = requests.post(constructed_url, params=params, headers=headers, json=body)\n",
    "        response = request.json()\n",
    "        response = response[0]['translations'][0]['text']\n",
    "        return response\n",
    "        pass\n",
    "    except Exception as e:\n",
    "        print(f\"Translation failed for text: {text}\")\n",
    "        return None\n",
    "translated_response = translate_text(\"\"\"\n",
    "\\onicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the \" Nameless \" , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit \" <unk> Raven \" . \n",
    "\\\n",
    "\\enjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the \" Nameless \" , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit \" <unk> Raven \" . \n",
    "\"\"\")\n",
    "print(translated_response)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import concurrent.futures\n",
    "from datasets import load_dataset\n",
    "import random\n",
    "from tqdm import tqdm\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kannada_dataset = load_dataset(\"CognitiveLab/Project_K_TrainDataset_500k\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kannada_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_samples = random.sample(kannada_dataset['test']['text'], 10)\n",
    "random_samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def translate_batch(batch):\n",
    "    with concurrent.futures.ThreadPoolExecutor() as executor:\n",
    "        english_translations = list(tqdm(executor.map(translate_text, batch), total=len(batch), desc=\"Translating\"))\n",
    "    return english_translations\n",
    "\n",
    "# Split the random samples into batches for parallel processing\n",
    "batch_size = 10\n",
    "sample_batches = [random_samples[i:i + batch_size] for i in range(0, len(random_samples), batch_size)]\n",
    "\n",
    "# Translate each batch of Kannada texts to English in parallel\n",
    "translated_data = {'src': [], 'tgt': []}\n",
    "\n",
    "for batch in tqdm(sample_batches, desc=\"Translating and Saving\"):\n",
    "    english_translations = translate_batch(batch)\n",
    "\n",
    "    for kannada_text, english_translation in zip(batch, english_translations):\n",
    "        if english_translation is not None:\n",
    "            translated_data['src'].append(kannada_text)\n",
    "            translated_data['tgt'].append(english_translation)\n",
    "\n",
    "# Create a DataFrame from the translated data\n",
    "translated_df = pd.DataFrame(translated_data)\n",
    "\n",
    "# Save the translated dataset to disk\n",
    "translated_df.to_csv('./translated_dataset.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hf_username = 'your_username'\n",
    "hf_dataset_name = 'your_translated_dataset'\n",
    "translated_dataset = Dataset.from_pandas(translated_df)\n",
    "translated_dataset.save_to_disk('./huggingface_dataset')\n",
    "translated_dataset.push_to_hub(f\"{hf_username}/{hf_dataset_name}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
